RA Bootcamp Warmup

Analysis

Author

Xihang ZOU

Published

August 17, 2024

パッケージの読み込み

#pacmanをインストール
if (!require(pacman))  {
  install.packages("pacman")
}

#tidyvreseをロード
pacman::p_load(readxl, tidyverse, here, kableExtra, gt, modelsummary)

1 記述統計

Masterの読み込み

#csvの読み込み
master <- read_csv(here("master.csv"))
# A tibble: 6 × 23
  unitid instnm             semester quarter  year yearofsem after totcohortsize
   <dbl> <chr>                 <dbl>   <dbl> <dbl>     <dbl> <dbl>         <dbl>
1 100654 ALABAMA A&M UNIVE…        1       0  1991        NA    NA          1010
2 100654 ALABAMA A&M UNIVE…        1       0  1992        NA    NA           876
3 100654 ALABAMA A & M UNI…        1       0  1993        NA    NA          1019
4 100654 ALABAMA A & M UNI…        1       0  1995        NA    NA           849
5 100654 ALABAMA A & M UNI…        1       0  1996        NA    NA           716
6 100654 ALABAMA A & M UNI…        1       0  1997        NA    NA           789
# ℹ 15 more variables: w_cohortsize <dbl>, m_cohortsize <dbl>,
#   tot4yrgrads <dbl>, m_4yrgrads <dbl>, w_4yrgrads <dbl>,
#   women_gradrate_4yr <dbl>, womengradrate4yr <dbl>, gradrate4yr <dbl>,
#   mengradrate4yr <dbl>, instatetuition <dbl>, costs <dbl>, faculty <dbl>,
#   white_cohortsize <dbl>, per_white_cohort <dbl>, per_women_cohort <dbl>

1.1 各列に含まれるNAの数を数える

#列ごとのNAをカウント
numbers_NA <- as.data.frame(sapply(master, function(x) sum(is.na(x)))) |> 
  rename("number_NA" = "sapply(master, function(x) sum(is.na(x)))")

#行と列を入れ替え
numbers_NA <- as.data.frame(t(numbers_NA))
# A tibble: 1 × 23
  unitid instnm semester quarter  year yearofsem after totcohortsize
   <int>  <int>    <int>   <int> <int>     <int> <int>         <int>
1      0      0        0       0     0     12844 12844             0
# ℹ 15 more variables: w_cohortsize <int>, m_cohortsize <int>,
#   tot4yrgrads <int>, m_4yrgrads <int>, w_4yrgrads <int>,
#   women_gradrate_4yr <int>, womengradrate4yr <int>, gradrate4yr <int>,
#   mengradrate4yr <int>, instatetuition <int>, costs <int>, faculty <int>,
#   white_cohortsize <int>, per_white_cohort <int>, per_women_cohort <int>

1.2 要約統計の作成

#要約統計を作成
summary_master <- master |> 
  group_by(switcher = if_else(!is.na(after), "Switchers", "Never switchers")) |>  
  summarise(
    `Semester calendar` = paste0(round(mean(semester, na.rm = TRUE), 2), " (", round(sd(semester, na.rm = TRUE), 2), ")"),
    `Four-year graduation rate` = paste0(round(mean(gradrate4yr, na.rm = TRUE), 2), " (", round(sd(gradrate4yr, na.rm = TRUE), 2), ")"),
    `Four-year women graduation rate` = paste0(round(mean(womengradrate4yr, na.rm = TRUE), 2), " (", round(sd(womengradrate4yr, na.rm = TRUE), 2), ")"),
    `Four-year men graduation rate` = paste0(round(mean(mengradrate4yr, na.rm = TRUE), 2), " (", round(sd(mengradrate4yr, na.rm = TRUE), 2), ")"),
    `Cohort size` = paste0(round(mean(totcohortsize, na.rm = TRUE), 2), " (", round(sd(totcohortsize, na.rm = TRUE), 2), ")"),
    `Full-time-equivalent faculty` = paste0(round(mean(faculty, na.rm = TRUE), 2), " (", round(sd(faculty, na.rm = TRUE), 2), ")"),
    `In-state tuition` = paste0(round(mean(instatetuition, na.rm = TRUE), 2), " (", round(sd(instatetuition, na.rm = TRUE), 2), ")")
  ) |> 
  bind_rows(master |> 
              summarise(
                 `Semester calendar` = paste0(round(mean(semester, na.rm = TRUE), 2), " (", round(sd(semester, na.rm = TRUE), 2), ")"),
    `Four-year graduation rate` = paste0(round(mean(gradrate4yr, na.rm = TRUE), 2), " (", round(sd(gradrate4yr, na.rm = TRUE), 2), ")"),
    `Four-year women graduation rate` = paste0(round(mean(womengradrate4yr, na.rm = TRUE), 2), " (", round(sd(womengradrate4yr, na.rm = TRUE), 2), ")"),
    `Four-year men graduation rate` = paste0(round(mean(mengradrate4yr, na.rm = TRUE), 2), " (", round(sd(mengradrate4yr, na.rm = TRUE), 2), ")"),
    `Cohort size` = paste0(round(mean(totcohortsize, na.rm = TRUE), 2), " (", round(sd(totcohortsize, na.rm = TRUE), 2), ")"),
    `Full-time-equivalent faculty` = paste0(round(mean(faculty, na.rm = TRUE), 2), " (", round(sd(faculty, na.rm = TRUE), 2), ")"),
    `In-state tuition` = paste0(round(mean(instatetuition, na.rm = TRUE), 2), " (", round(sd(instatetuition, na.rm = TRUE), 2), ")")
              ) |>  mutate(switcher = "All")
  ) |> 
  arrange(match(switcher, c("All", "Never switchers", "Switchers")))

#行と列を入れ替え
summary_master <- as.data.frame(t(summary_master))

#列名を変更
colnames(summary_master) <- summary_master[1, ]
summary_master <- summary_master[-1, ]

#表を作成
summary_master |> 
  kbl(caption = "Table 1—Institution-Level Summary Statistics", format = "html") |> 
  kable_styling(bootstrap_options = "condensed",
                full_width = FALSE, 
                font_size = 14, 
                position = "center",
                html_font = "Times New Roman") |> 
  column_spec(1, width = "200px") |> 
  column_spec(2:4, width = "110px", extra_css = "text-align: center;") |>
  row_spec(0, bold = TRUE, extra_css = "text-align: center; vertical-align: middle;", 
           hline_after = TRUE) |> 
  row_spec(1:nrow(summary_master), extra_css = "height: 40px; vertical-align: middle;") |> 
  footnote(general = "The balanced panel dataset includes the 1991–2010 entering cohorts. There are 731 institutions and 19 years. An observation is an institution year. Standard deviations are reported in parentheses.",
           general_title = "Notes:", 
           footnote_as_chunk = TRUE)
Table 1—Institution-Level Summary Statistics
All Never switchers Switchers
Semester calendar 0.93 (0.25) 0.95 (0.22) 0.7 (0.46)
Four-year graduation rate 0.37 (0.23) 0.38 (0.23) 0.27 (0.18)
Four-year women graduation rate 0.41 (0.23) 0.42 (0.23) 0.32 (0.2)
Four-year men graduation rate 0.32 (0.23) 0.33 (0.23) 0.22 (0.18)
Cohort size 1099.45 (1183.03) 1084.86 (1170.03) 1278.78 (1319.97)
Full-time-equivalent faculty 340 (382.59) 335.03 (377.78) 401.04 (432.91)
In-state tuition 11088.47 (9181.55) 11375.81 (9238.61) 7556.8 (7612.64)
Notes: The balanced panel dataset includes the 1991–2010 entering cohorts. There are 731 institutions and 19 years. An observation is an institution year. Standard deviations are reported in parentheses.

1.3 4年卒業率の平均推移をプロット

#4年卒業率を計算
summary_semesterrate <- master |> 
  group_by(year) |> 
  summarize(fraction_on_semesters = mean(semester, na.rm = TRUE))


# グラフをプロット
summary_semesterrate |> 
ggplot(aes(x = year)) +
  geom_line(aes(y = fraction_on_semesters), color = "black", size = 0.5) +
  scale_y_continuous(
    name = "4-year graduation rate",
    limits = c(0.8, 1)
  ) +
  labs(
    title = "Figure 1. Four-Year Graduation Rates",
    x = "Year"
  ) +
  theme_minimal()+
  theme(
    plot.title = element_text(hjust = 0.5, family = "serif"),
    panel.grid = element_blank(),
    axis.line = element_line(),
    axis.ticks = element_line(),
    axis.title.y.left = element_text(margin = margin(r = 10)),  
    axis.title.y.right = element_text(margin = margin(l = 10))
  )

1.4 semester制導入率の平均推移をプロット

#4年卒業率を計算
summary_gradrate4yr <- master |> 
  group_by(year) |> 
  summarize(avg_gradrate4yr = mean(gradrate4yr, na.rm = TRUE))


# グラフをプロット
summary_gradrate4yr |> 
ggplot(aes(x = year)) +
  geom_line(aes(y = avg_gradrate4yr), color = "black", size = 0.5) +
  scale_y_continuous(
    name = "Fraction of schools on semesters",
    limits = c(0.25, 0.45)
  ) +
  labs(
    title = "Figure 2. Fraction of Schools on Semesters",
    x = "Year"
  ) +
  theme_minimal()+
  theme(
    plot.title = element_text(hjust = 0.5, family = "serif"),
    panel.grid = element_blank(),
    axis.line = element_line(),
    axis.ticks = element_line(),
    axis.title.y.left = element_text(margin = margin(r = 10)),  
    axis.title.y.right = element_text(margin = margin(l = 10))
  )

1.5 4年卒業率とsemester制導入率の平均推移を同時にプロット

#要約統計量を計算
summary_rates <- master |> 
  group_by(year) |> 
  summarize(fraction_on_semesters = mean(semester, na.rm = TRUE),
            avg_gradrate4yr = mean(gradrate4yr, na.rm = TRUE))


# グラフをプロット
summary_rates |> 
ggplot(aes(x = year)) +
  geom_line(aes(y = fraction_on_semesters, linetype = "Fraction of schools on semesters"), color = "black", size = 0.5) +
  geom_line(aes(y = avg_gradrate4yr + 0.55, linetype = "4-year graduation rate"), color = "black", size = 0.5) +
  scale_y_continuous(
    name = "Fraction of schools on semesters",
    limits = c(0.8, 1),  # Adjust limits as necessary
    sec.axis = sec_axis(~. -0.55, name = "4-year graduation rate" )
  ) +
  scale_linetype_manual(
    values = c("Fraction of schools on semesters" = "solid", 
               "4-year graduation rate" = "dashed")
  ) +
  labs(
    title = "Figure 3. Fraction of Schools on Semesters and Four-Year Graduation Rates",
    x = "Year",
    linetype = "Legend"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, family = "serif"),
    legend.position = "bottom",
    panel.grid = element_blank(),
    axis.line = element_line(),
    axis.ticks = element_line(),
    axis.title.y.left = element_text(margin = margin(r = 10)),  
    axis.title.y.right = element_text(margin = margin(l = 10))
  )

1.6 散布図を作成

# 散布図を作成する関数
create_scatter_plot <- function(data, x_col, y_col) {
  x_col <- enquo(x_col)
  y_col <- enquo(y_col)
  
  ggplot(data, aes(x = !!x_col, y = !!y_col)) +
    geom_point(color = "blue", alpha = 0.2) +
    labs(x = quo_name(x_col), y = quo_name(y_col)) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, family = "serif"))
}

plot1 <- create_scatter_plot(master, gradrate4yr, per_women_cohort) + labs(title = "Figure 4a. Four-Year Graduation Rates vs Female Students Ratio")
plot2 <- create_scatter_plot(master, gradrate4yr, per_white_cohort) + labs(title = "Figure 4b. Four-Year Graduation Rates vs White Students Ratio")
plot3 <- create_scatter_plot(master, gradrate4yr, instatetuition) + labs(title = "Figure 4c. Four-Year Graduation Rates vs In-state Tuition")

2 回帰分析

2.1 次の回帰式を推定

lm(formula = gradrate4yr ~ after, data = master) |> 
  modelsummary()
tinytable_qh4ink4oga6dq0texfss
(1)
(Intercept) 0.251
(0.010)
after 0.031
(0.012)
Num.Obs. 1045
R2 0.006
R2 Adj. 0.005
AIC -577.3
BIC -562.4
Log.Lik. 291.633
RMSE 0.18